{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "toc": true
   },
   "source": [
    "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
    "<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Set-up-directories\" data-toc-modified-id=\"Set-up-directories-1\"><span class=\"toc-item-num\">1&nbsp;&nbsp;</span>Set up directories</a></span></li><li><span><a href=\"#Scrape-html-files-containing-acts\" data-toc-modified-id=\"Scrape-html-files-containing-acts-2\"><span class=\"toc-item-num\">2&nbsp;&nbsp;</span>Scrape html files containing acts</a></span></li><li><span><a href=\"#Export-to-stata\" data-toc-modified-id=\"Export-to-stata-3\"><span class=\"toc-item-num\">3&nbsp;&nbsp;</span>Export to stata</a></span></li><li><span><a href=\"#Compute-moving-averages\" data-toc-modified-id=\"Compute-moving-averages-4\"><span class=\"toc-item-num\">4&nbsp;&nbsp;</span>Compute moving averages</a></span></li></ul></div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook scrapes saved Wikipedia HTML files to build a data frame of UK parliamentary acts dated before 1915, flags the acts related to schooling and education, and computes a five-year moving average of the yearly number of such acts."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-15T14:17:20.571767Z",
     "start_time": "2021-05-15T14:17:19.874874Z"
    }
   },
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import glob\n",
    "import re\n",
    "import csv\n",
    "import pandas as pd\n",
    "import os"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Set up directories"
   ]
  },
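  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Ensure that the current working directory (CWD) is the scripts folder (`01_scripts`) of the replication directory; the data paths below are built relative to its parent folder."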
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-15T14:17:20.580930Z",
     "start_time": "2021-05-15T14:17:20.573425Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/Volumes/GoogleDrive-107745440581041782819/My Drive/00_Researching/16_SocialScientization/-04_EJS/00_replication/01_scripts'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the current working directory; the data paths in the next cell are built from its parent folder.\n",
    "os.getcwd()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-15T14:17:20.585792Z",
     "start_time": "2021-05-15T14:17:20.583362Z"
    }
   },
   "outputs": [],
   "source": [
    "directory = os.path.dirname(os.getcwd()) + \"/\"\n",
    "data = directory + \"00_data/\"\n",
    "acts_folder = data + \"00_dv/\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Scrape html files containing acts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-15T14:17:21.746777Z",
     "start_time": "2021-05-15T14:17:20.588108Z"
    }
   },
   "outputs": [],
   "source": [
    "acts = set()\n",
    "files = glob.glob(acts_folder+\"*htm\")\n",
    "for file in files:\n",
    "    with open(file, 'r') as f: \n",
    "        content = f.read()\n",
    "        soup = BeautifulSoup(content, 'lxml-xml')\n",
    "        a_s = soup.find_all(\"a\")\n",
    "        pattern = \"Act\\s{1}\\d{4}\"\n",
    "        for a in a_s:\n",
    "            title = a.get(\"title\")\n",
    "            if title is not None:\n",
    "                act = re.search(pattern, title)\n",
    "                if act is not None:\n",
    "                    acts.add(title.replace(\" (page does not exist)\", \"\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-15T14:17:21.789182Z",
     "start_time": "2021-05-15T14:17:21.748521Z"
    }
   },
   "outputs": [],
   "source": [
    "with open(acts_folder+\"sch_acts.csv\", \"w\", encoding='utf-8') as f:\n",
    "    f_writer = csv.writer(f, delimiter=',', quotechar='\"', quoting=csv.QUOTE_MINIMAL)\n",
    "    f_writer.writerow(['year', 'act', 'sch'])\n",
    "    i = -1\n",
    "    for act in acts:\n",
    "        i += 1\n",
    "        year = re.search(\"(?:18|19)\\d{2}\", act)\n",
    "        if (year is not None) and (int(year.group()) < 1915): \n",
    "            if (\"School\" in act) or (\"Educat\" in act):\n",
    "                sch = 1\n",
    "            else:\n",
    "                sch = 0\n",
    "            f_writer.writerow([int(year.group()), act, sch])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Export to stata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-15T14:17:21.801177Z",
     "start_time": "2021-05-15T14:17:21.790735Z"
    }
   },
   "outputs": [],
   "source": [
    "sch_acts = pd.read_csv(acts_folder+\"sch_acts.csv\")\n",
    "sch_acts.to_stata(acts_folder+\"sch_acts.dta\", write_index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Compute moving averages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-15T14:17:21.823467Z",
     "start_time": "2021-05-15T14:17:21.802878Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "n_sch_acts = []\n",
    "years = []\n",
    "for year, df in sch_acts.groupby('year'):\n",
    "    years.append(year)\n",
    "    n_sch_acts.append(df['sch'].sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-15T14:17:21.831704Z",
     "start_time": "2021-05-15T14:17:21.826922Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# set averaged lag period\n",
    "lag = 5\n",
    "mov_avg = []\n",
    "for i in range(len(n_sch_acts)+1):\n",
    "    # keep within range\n",
    "    if i < lag: \n",
    "        mov_avg.append(sum(n_sch_acts[i-i:i+1])/len(n_sch_acts[i-i:i+1]))\n",
    "    if i > lag:\n",
    "        mov_avg.append(sum(n_sch_acts[i-lag:i])/len(n_sch_acts[i-lag:i]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-05-15T14:17:21.863927Z",
     "start_time": "2021-05-15T14:17:21.834333Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(114, 2)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>year</th>\n",
       "      <th>mov_avg</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1801</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1802</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1803</td>\n",
       "      <td>0.333333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1804</td>\n",
       "      <td>0.250000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1805</td>\n",
       "      <td>0.200000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109</th>\n",
       "      <td>1910</td>\n",
       "      <td>1.800000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>110</th>\n",
       "      <td>1911</td>\n",
       "      <td>1.600000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111</th>\n",
       "      <td>1912</td>\n",
       "      <td>1.600000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>112</th>\n",
       "      <td>1913</td>\n",
       "      <td>1.800000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113</th>\n",
       "      <td>1914</td>\n",
       "      <td>2.800000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>114 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     year   mov_avg\n",
       "0    1801  0.000000\n",
       "1    1802  0.000000\n",
       "2    1803  0.333333\n",
       "3    1804  0.250000\n",
       "4    1805  0.200000\n",
       "..    ...       ...\n",
       "109  1910  1.800000\n",
       "110  1911  1.600000\n",
       "111  1912  1.600000\n",
       "112  1913  1.800000\n",
       "113  1914  2.800000\n",
       "\n",
       "[114 rows x 2 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "moving_averages = pd.DataFrame()\n",
    "moving_averages['year'] = years\n",
    "moving_averages['mov_avg'] = mov_avg\n",
    "moving_averages.to_stata(acts_folder+\"mov_avg_acts.dta\", write_index=False)\n",
    "print(moving_averages.shape)\n",
    "moving_averages"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": true,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
